Clearing Global Environment
rm(list = ls())
Installing Packages
options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("tidyr")
##
## The downloaded binary packages are in
## /var/folders/y4/1qsgv9rs22gf6l_qrwz2tbf40000gn/T//RtmphdQhrw/downloaded_packages
install.packages("plotly")
##
## The downloaded binary packages are in
## /var/folders/y4/1qsgv9rs22gf6l_qrwz2tbf40000gn/T//RtmphdQhrw/downloaded_packages
Libraries
library("readr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(plotly)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Importing Data
calendar = read_csv('../data/calendar.csv')
## Rows: 1969 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): weekday, event_name_1, event_type_1, event_name_2, event_type_2
## dbl (7): wm_yr_wk, wday, month, year, snap_CA, snap_TX, snap_WI
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(calendar)
## # A tibble: 6 × 13
## date wm_yr_wk weekday wday month year event_name_1 event_type_1
## <date> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2011-01-29 11101 Saturday 1 1 2011 <NA> <NA>
## 2 2011-01-30 11101 Sunday 2 1 2011 <NA> <NA>
## 3 2011-01-31 11101 Monday 3 1 2011 <NA> <NA>
## 4 2011-02-01 11101 Tuesday 4 2 2011 <NA> <NA>
## 5 2011-02-02 11101 Wednesday 5 2 2011 <NA> <NA>
## 6 2011-02-03 11101 Thursday 6 2 2011 <NA> <NA>
## # ℹ 5 more variables: event_name_2 <chr>, event_type_2 <chr>, snap_CA <dbl>,
## # snap_TX <dbl>, snap_WI <dbl>
sales_train_validation = read_csv('../data/sales_train_validation.csv')
## Rows: 30490 Columns: 1918
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): item_id, dept_id, cat_id, store_id, state_id
## dbl (1913): d_1, d_2, d_3, d_4, d_5, d_6, d_7, d_8, d_9, d_10, d_11, d_12, d...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sales_train_validation)
## # A tibble: 6 × 1,918
## item_id dept_id cat_id store_id state_id d_1 d_2 d_3 d_4 d_5 d_6
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## 2 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## 3 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## 4 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## 5 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## 6 HOBBIES_… HOBBIE… HOBBI… CA_1 CA 0 0 0 0 0 0
## # ℹ 1,907 more variables: d_7 <dbl>, d_8 <dbl>, d_9 <dbl>, d_10 <dbl>,
## # d_11 <dbl>, d_12 <dbl>, d_13 <dbl>, d_14 <dbl>, d_15 <dbl>, d_16 <dbl>,
## # d_17 <dbl>, d_18 <dbl>, d_19 <dbl>, d_20 <dbl>, d_21 <dbl>, d_22 <dbl>,
## # d_23 <dbl>, d_24 <dbl>, d_25 <dbl>, d_26 <dbl>, d_27 <dbl>, d_28 <dbl>,
## # d_29 <dbl>, d_30 <dbl>, d_31 <dbl>, d_32 <dbl>, d_33 <dbl>, d_34 <dbl>,
## # d_35 <dbl>, d_36 <dbl>, d_37 <dbl>, d_38 <dbl>, d_39 <dbl>, d_40 <dbl>,
## # d_41 <dbl>, d_42 <dbl>, d_43 <dbl>, d_44 <dbl>, d_45 <dbl>, d_46 <dbl>, …
sales_test_validation = read_csv('../data/sales_test_validation.csv')
## Rows: 30490 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): item_id, dept_id, cat_id, store_id, state_id
## dbl (28): d_1914, d_1915, d_1916, d_1917, d_1918, d_1919, d_1920, d_1921, d_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sales_test_validation)
## # A tibble: 6 × 33
## item_id dept_id cat_id store_id state_id d_1914 d_1915 d_1916 d_1917 d_1918
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 0 0 0 2 0
## 2 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 0 1 0 0 0
## 3 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 0 0 1 1 0
## 4 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 0 0 1 2 4
## 5 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 1 0 2 3 1
## 6 HOBBIES_1… HOBBIE… HOBBI… CA_1 CA 0 0 1 0 0
## # ℹ 23 more variables: d_1919 <dbl>, d_1920 <dbl>, d_1921 <dbl>, d_1922 <dbl>,
## # d_1923 <dbl>, d_1924 <dbl>, d_1925 <dbl>, d_1926 <dbl>, d_1927 <dbl>,
## # d_1928 <dbl>, d_1929 <dbl>, d_1930 <dbl>, d_1931 <dbl>, d_1932 <dbl>,
## # d_1933 <dbl>, d_1934 <dbl>, d_1935 <dbl>, d_1936 <dbl>, d_1937 <dbl>,
## # d_1938 <dbl>, d_1939 <dbl>, d_1940 <dbl>, d_1941 <dbl>
sell_prices = read_csv('../data/sell_prices.csv')
## Rows: 6841121 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): store_id, item_id
## dbl (2): wm_yr_wk, sell_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sell_prices)
## # A tibble: 6 × 4
## store_id item_id wm_yr_wk sell_price
## <chr> <chr> <dbl> <dbl>
## 1 CA_1 HOBBIES_1_001 11325 9.58
## 2 CA_1 HOBBIES_1_001 11326 9.58
## 3 CA_1 HOBBIES_1_001 11327 8.26
## 4 CA_1 HOBBIES_1_001 11328 8.26
## 5 CA_1 HOBBIES_1_001 11329 8.26
## 6 CA_1 HOBBIES_1_001 11330 8.26
weights_validation = read_csv('../data/weights_validation.csv')
## Rows: 42840 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Level_id, Agg_Level_1, Agg_Level_2
## dbl (2): Dollar_Sales, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(weights_validation)
## # A tibble: 6 × 5
## Level_id Agg_Level_1 Agg_Level_2 Dollar_Sales weight
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Level12 HOBBIES_1_001 CA_1 225. 0.00006
## 2 Level12 HOBBIES_1_002 CA_1 7.94 0.00000212
## 3 Level12 HOBBIES_1_003 CA_1 47.5 0.0000127
## 4 Level12 HOBBIES_1_004 CA_1 237. 0.0000631
## 5 Level12 HOBBIES_1_005 CA_1 109. 0.0000292
## 6 Level12 HOBBIES_1_006 CA_1 16.3 0.00000435
Data Aggregation
# Joined data into a single data frame
# Pivot single data frame so dates become rows
# Aggregate to category/day
# Make date format the same as calendar
pivoted_sales_train_validation <- sales_train_validation %>%
pivot_longer(
cols = starts_with("d_"), # Select columns starting with 'd_'
names_to = "day", # New column name for the pivoted columns
values_to = "value" # New column name for the values in the selected columns
) %>%
mutate(day = as.numeric(gsub("d_", "", day)))
grouped_data <- pivoted_sales_train_validation %>%
group_by(cat_id, day) %>%
summarize(total = sum(value))
## `summarise()` has grouped output by 'cat_id'. You can override using the
## `.groups` argument.
# Add day column to calendar data frame
calendar$day <- 1:nrow(calendar)
# left join data
grouped_data <- left_join(grouped_data, calendar, by = "day")
# Holiday list variables
influential_holidays = list(
#Thanksgiving 2011-2015
list(
type = "line",
x0 = as.Date("2011-11-24"), x1 = as.Date("2011-11-24"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "brown", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2012-11-22"), x1 = as.Date("2012-11-22"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "brown", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2013-11-28"), x1 = as.Date("2013-11-28"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "brown", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2014-11-27"), x1 = as.Date("2014-11-27"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "brown", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2015-11-26"), x1 = as.Date("2015-11-26"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "brown", dash = "dash") # Set line color and style
),
#Christmas 2011-2015
list(
type = "line",
x0 = as.Date("2011-12-25"), x1 = as.Date("2011-12-25"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "red", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2012-12-25"), x1 = as.Date("2012-12-25"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "red", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2013-12-25"), x1 = as.Date("2013-12-25"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "red", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2014-12-25"), x1 = as.Date("2014-12-25"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "red", dash = "dash") # Set line color and style
),
list(
type = "line",
x0 = as.Date("2015-12-25"), x1 = as.Date("2015-12-25"), # x-coordinate for the vertical line
y0 = 0, y1 = 40000 * 1.1, # Extend from y = 0 to y above the max value
line = list(color = "red", dash = "dash") # Set line color and style
)
)
# plot data
quantity_plot <- plot_ly(grouped_data, x = ~as.Date(date), y = ~total, color = ~cat_id, type='scatter', mode = 'lines') %>%
layout(title = "Walmart Product Data",
xaxis = list(title = "Date"),
yaxis = list(title = "Quantity"),
shapes = influential_holidays)
quantity_plot